Glasgow Concreteness Norms

Load data

### setwd("/Users/seantrott/Dropbox/UCSD/Research/NLMs/llm_clt/src/analysis/")

### Read in all data
### One row per simulated human sample (k participants from one stimulus
### list), with its correlations against the published concreteness norms.
df_all_results = read_csv("../../data/processed/gc_results.csv")
## Rows: 33214 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): k, combo_index, list_num, n, pearson_centaur1, spearman_centaur1, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Sanity check: total number of simulated samples, and the count per list.
nrow(df_all_results)
## [1] 33214
### How many per list?
table(df_all_results$list_num)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  126 1022 2046  510 2046 3086 2046 3086 3086  510 2046 3086 3086 3086 2046  254 
##   17 
## 2046
### LLM data: list-wise correlations between GPT-4 judgments and the norms.
### `show_col_types = FALSE` silences readr's column-spec message.
df_llm = read_csv("../../data/processed/gc_llm_corrs.csv",
                  show_col_types = FALSE)

### Mean/SD of the LLM's list-wise Spearman correlation.
### FIX: the output column was misspelled `m_spearmam` (the Iconicity
### section of this document uses the correct `m_spearman`).
df_llm %>%
  summarise(m_spearman = mean(spearman_llm),
            sd_spearman = sd(spearman_llm))

Figure 1a

### Individual participants correspond to simulated samples of size k = 1.
df_individuals = df_all_results %>%
  filter(k == 1)

### Histogram of single-participant correlations with the norms; the
### dotted blue line marks the mean LLM correlation across lists.
df_individuals %>%
  ggplot(aes(x = spearman_ppt)) +
  geom_histogram(alpha = .5) +
  labs(x="Correlation with Original Concreteness Norms", y="Count") +
  theme_minimal() +
  geom_vline(xintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue", alpha = .8) +
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Figure 1b

### Pivot
### Reshape to long format: one row per (sample, sample_type) correlation.
### "ppt" (participant) is relabeled "Human" for display.
df_results_long = df_all_results %>%
  select('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt',
         'k', 'combo_index', 'list_num') %>%
  pivot_longer(cols = c('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt'),
               names_to = "sample_type",
               values_to = "correlation") %>%
  mutate(sample_type = sub("spearman_", "", sample_type)) %>%
  mutate(sample_type = ifelse(
    sample_type == "ppt", "Human", sample_type
  ))


### Visualize
### Mean correlation by sample size (k) and sample type, with +/- 2 SE
### error bars; the dotted line is the mean LLM correlation across lists.
df_results_summ = df_results_long %>%
  group_by(k, sample_type) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()),
            ### Drop grouping explicitly (silences the summarise() message).
            .groups = "drop") %>%
  mutate(sample_type = toTitleCase(sample_type))

df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color = factor(sample_type), shape = factor(sample_type)),
             size = 3, alpha = .5) +  # Add points
  geom_line(aes(color = factor(sample_type))) +  # Connect points with lines
  ### FIX: `width` is a layer parameter, not an aesthetic — it was inside
  ### aes(), where ggplot2 ignores it (with a warning).
  geom_errorbar(aes(ymin = m_corr - se_corr * 2, ymax = m_corr + se_corr * 2,
                    color = factor(sample_type)), width = 0.2) +
  labs(x="Number of Participants", y="Correlation with Concreteness Norms",
       color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
scale_color_manual(values = my_colors)

Figure 1c

Here, we visualize projected differences in quality and cost of a human sample vs. GPT-4.

Setting up cost assumptions

For human cost, we assume:

  • Rate of $12 an hour.
  • Approximately 5 seconds per judgment.
  • Approximately 720 judgments per hour (3600 s / 5 s per judgment).

For GPT-4 cost, we assume:

  • $0.06 per 1000 generated tokens.
  • $0.03 per 1000 sampled tokens.
  • Approximately 20 sampled tokens per judgment.
  • Approximately 10 generated tokens per judgment.
### Human assumptions
RATE = 12                  # dollars per hour
SECONDS_PER_JUDGMENT = 5   # implies 3600/5 = 720 judgments per hour

### Human cost per judgment = hourly rate / judgments per hour.
HUMAN_CPJ = RATE / (3600/SECONDS_PER_JUDGMENT)

### GPT-4 assumptions, matching the stated pricing above:
### $0.03 per 1K sampled (prompt) tokens, $0.06 per 1K generated tokens.
### FIX: the previous constants (0.0003 / 0.0006) were 100x smaller than
### the stated prices, and the generated-token term carried a stray
### `1000 *` factor, so GPT_CPJ did not match the documented assumptions.
COST_PER_1K_SAMPLED = 0.03
COST_PER_1K_GENERATED = 0.06
NUM_SAMPLED_PER_JUDGMENT = 20
NUM_GENERATED_PER_JUDGMENT = 10

### GPT cost per judgment: tokens/1K times price per 1K, for each token type.
GPT_CPJ = (NUM_SAMPLED_PER_JUDGMENT / 1000) * COST_PER_1K_SAMPLED +
  (NUM_GENERATED_PER_JUDGMENT / 1000) * COST_PER_1K_GENERATED

Visualizing

### Visualize

### For each k, express quality and cost relative to the LLM-only baseline
### (a ratio of 1 means parity with GPT-4).
df_costs_summ = df_all_results %>%
  mutate(ratio_human_quality = spearman_ppt / spearman_llm) %>%
  mutate(ratio_human_cost = (k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur1_quality = spearman_centaur1 / spearman_llm) %>%
  mutate(ratio_centaur1_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur2_quality = spearman_centaur2 / spearman_llm) %>%
  mutate(ratio_centaur2_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  group_by(k) %>%
  summarise(m_human_ratio_quality = mean(ratio_human_quality),
            se_human_ratio_quality = sd(ratio_human_quality)/sqrt(n()),
            m_human_ratio_cost = mean(ratio_human_cost),
            ### FIX: the cost "SE" columns were plain SDs; divide by
            ### sqrt(n()) for consistency with the quality SEs. (Cost is
            ### constant within k, so the value is 0 either way.)
            se_human_ratio_cost = sd(ratio_human_cost)/sqrt(n()),
            ### Centaur1
            m_centaur1_ratio_quality = mean(ratio_centaur1_quality),
            se_centaur1_ratio_quality = sd(ratio_centaur1_quality)/sqrt(n()),
            m_centaur1_ratio_cost = mean(ratio_centaur1_cost),
            se_centaur1_ratio_cost = sd(ratio_centaur1_cost)/sqrt(n()),
            ### Centaur2
            m_centaur2_ratio_quality = mean(ratio_centaur2_quality),
            se_centaur2_ratio_quality = sd(ratio_centaur2_quality)/sqrt(n()),
            m_centaur2_ratio_cost = mean(ratio_centaur2_cost),
            se_centaur2_ratio_cost = sd(ratio_centaur2_cost)/sqrt(n()))

### Reshape the wide per-k summary into one row per (k, sample_type), with
### columns m_quality / se_quality / m_cost / se_cost, for plotting.
df_costs_summ_long <- df_costs_summ %>%
  pivot_longer(
    cols = -k,
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    sample_type = str_extract(variable, "human|centaur1|centaur2"),
    metric = case_when(
      str_detect(variable, "m_.*_quality") ~ "m_quality",
      str_detect(variable, "se_.*_quality") ~ "se_quality",
      str_detect(variable, "m_.*_cost") ~ "m_cost",
      str_detect(variable, "se_.*_cost") ~ "se_cost"
    )
  ) %>%
  select(-variable) %>%
  pivot_wider(
    names_from = metric,
    values_from = value
  ) %>%
  mutate(sample_type = toTitleCase(sample_type))


### Quality ratio vs. cost ratio by sample type; the dotted vertical line
### marks quality parity with the LLM (ratio = 1).
df_costs_summ_long %>%
  ggplot(aes(x = m_quality, y = m_cost, color = sample_type, shape = sample_type)) +
  geom_point(size=3, alpha = .5) +  # Add points
  geom_line(alpha = .6) +  # Connect points with lines
  labs(x="Quality Ratio", y="Cost Ratio",
       color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_vline(xintercept = 1, linetype = "dotted") +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
  scale_color_manual(values = my_colors)

RAW-C Norms

Load data

### Read in all data
### Same structure as the Glasgow concreteness results, for RAW-C.
df_all_results = read_csv("../../data/processed/rawc_results.csv")
## Rows: 21340 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): k, combo_index, list_num, n, pearson_centaur1, spearman_centaur1, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Sanity check: total number of simulated samples, and the count per list.
nrow(df_all_results)
## [1] 21340
### How many per list?
table(df_all_results$list_num)
## 
##    1    2    3    4    5    6    7    8 
## 3086 3754  510 4438 1022 3754 3754 1022
### LLM data: list-wise correlations between GPT-4 judgments and the norms.
df_llm = read_csv("../../data/processed/rawc_llm_corrs.csv",
                  show_col_types = FALSE)

### Mean/SD of the LLM's list-wise Spearman correlation.
### FIX: the output column was misspelled `m_spearmam`.
df_llm %>%
  summarise(m_spearman = mean(spearman_llm),
            sd_spearman = sd(spearman_llm))

Figure 1d

### Individual participants correspond to simulated samples of size k = 1.
df_individuals = df_all_results %>%
  filter(k == 1)

### Mean/SD of the single-participant correlations.
mean(df_individuals$spearman_ppt)
## [1] 0.7762202
sd(df_individuals$spearman_ppt)
## [1] 0.103045
### Histogram; the dotted line marks the mean LLM correlation across lists.
df_individuals %>%
  ggplot(aes(x = spearman_ppt)) +
  geom_histogram(alpha = .5) +
  labs(x="Correlation with Original RAW-C Norms", y="Count") +
  theme_minimal() +
  geom_vline(xintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue",
             size = 1.2, alpha = .5) +
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Figure 1e

### Pivot
### Reshape to long format: one row per (sample, sample_type) correlation.
### "ppt" (participant) is relabeled "Human" for display.
df_results_long = df_all_results %>%
  select('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt',
         'k', 'combo_index', 'list_num') %>%
  pivot_longer(cols = c('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt'),
               names_to = "sample_type",
               values_to = "correlation") %>%
  mutate(sample_type = sub("spearman_", "", sample_type)) %>%
  mutate(sample_type = ifelse(
    sample_type == "ppt", "Human", sample_type
  ))


### Visualize
### Mean correlation by sample size (k) and sample type, with +/- 2 SE
### error bars; the dotted line is the mean LLM correlation across lists.
df_results_summ = df_results_long %>%
  group_by(k, sample_type) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()),
            ### Drop grouping explicitly (silences the summarise() message).
            .groups = "drop") %>%
  mutate(sample_type = toTitleCase(sample_type))

df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color=factor(sample_type),shape = factor(sample_type)), size =3, alpha = .6) +  # Add points
  geom_line(aes(color=factor(sample_type)), alpha = .5) +  # Connect points with lines
  ### FIX: `width` is a layer parameter, not an aesthetic — it was inside
  ### aes(), where ggplot2 ignores it (with a warning).
  geom_errorbar(aes(ymin=m_corr-se_corr * 2, ymax=m_corr+se_corr * 2,
                    color = factor(sample_type)), width = 0.2, alpha = .5) +
  labs(x="Number of Participants", y="Spearman's Rho", color = "Sample Type",
       shape = "Sample Type") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue",
             ### NOTE(review): `size` for lines is superseded by `linewidth`
             ### in ggplot2 >= 3.4 — update if the package is upgraded.
             size = 1.2, alpha = .5) +
  theme(text = element_text(size = 15),
        legend.position="bottom")+
  scale_color_manual(values = my_colors)

Figure 1f

Setting up cost assumptions

For human cost, we assume:

  • Rate of $12 an hour.
  • Approximately 5 seconds per judgment.
  • Approximately 720 judgments per hour (3600 s / 5 s per judgment).

For GPT-4 cost, we assume:

  • $0.06 per 1000 generated tokens.
  • $0.03 per 1000 sampled tokens.
  • Approximately 20 sampled tokens per judgment.
  • Approximately 10 generated tokens per judgment.
### Human assumptions
RATE = 12                  # dollars per hour
SECONDS_PER_JUDGMENT = 5   # implies 3600/5 = 720 judgments per hour

### Human cost per judgment = hourly rate / judgments per hour.
HUMAN_CPJ = RATE / (3600/SECONDS_PER_JUDGMENT)

### GPT-4 assumptions, matching the stated pricing above:
### $0.03 per 1K sampled (prompt) tokens, $0.06 per 1K generated tokens.
### FIX: the previous constants (0.0003 / 0.0006) were 100x smaller than
### the stated prices, and the generated-token term carried a stray
### `1000 *` factor, so GPT_CPJ did not match the documented assumptions.
COST_PER_1K_SAMPLED = 0.03
COST_PER_1K_GENERATED = 0.06
NUM_SAMPLED_PER_JUDGMENT = 20
NUM_GENERATED_PER_JUDGMENT = 10

### GPT cost per judgment: tokens/1K times price per 1K, for each token type.
GPT_CPJ = (NUM_SAMPLED_PER_JUDGMENT / 1000) * COST_PER_1K_SAMPLED +
  (NUM_GENERATED_PER_JUDGMENT / 1000) * COST_PER_1K_GENERATED

Visualizing

### Visualize

### For each k, express quality and cost relative to the LLM-only baseline
### (a ratio of 1 means parity with GPT-4).
df_costs_summ = df_all_results %>%
  mutate(ratio_human_quality = spearman_ppt / spearman_llm) %>%
  mutate(ratio_human_cost = (k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur1_quality = spearman_centaur1 / spearman_llm) %>%
  mutate(ratio_centaur1_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur2_quality = spearman_centaur2 / spearman_llm) %>%
  mutate(ratio_centaur2_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  group_by(k) %>%
  summarise(m_human_ratio_quality = mean(ratio_human_quality),
            se_human_ratio_quality = sd(ratio_human_quality)/sqrt(n()),
            m_human_ratio_cost = mean(ratio_human_cost),
            ### FIX: the cost "SE" columns were plain SDs; divide by
            ### sqrt(n()) for consistency with the quality SEs. (Cost is
            ### constant within k, so the value is 0 either way.)
            se_human_ratio_cost = sd(ratio_human_cost)/sqrt(n()),
            ### Centaur1
            m_centaur1_ratio_quality = mean(ratio_centaur1_quality),
            se_centaur1_ratio_quality = sd(ratio_centaur1_quality)/sqrt(n()),
            m_centaur1_ratio_cost = mean(ratio_centaur1_cost),
            se_centaur1_ratio_cost = sd(ratio_centaur1_cost)/sqrt(n()),
            ### Centaur2
            m_centaur2_ratio_quality = mean(ratio_centaur2_quality),
            se_centaur2_ratio_quality = sd(ratio_centaur2_quality)/sqrt(n()),
            m_centaur2_ratio_cost = mean(ratio_centaur2_cost),
            se_centaur2_ratio_cost = sd(ratio_centaur2_cost)/sqrt(n()))

### Reshape the wide per-k summary into one row per (k, sample_type), with
### columns m_quality / se_quality / m_cost / se_cost, for plotting.
df_costs_summ_long <- df_costs_summ %>%
  pivot_longer(
    cols = -k,
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    sample_type = str_extract(variable, "human|centaur1|centaur2"),
    metric = case_when(
      str_detect(variable, "m_.*_quality") ~ "m_quality",
      str_detect(variable, "se_.*_quality") ~ "se_quality",
      str_detect(variable, "m_.*_cost") ~ "m_cost",
      str_detect(variable, "se_.*_cost") ~ "se_cost"
    )
  ) %>%
  select(-variable) %>%
  pivot_wider(
    names_from = metric,
    values_from = value
  ) %>%
  mutate(sample_type = toTitleCase(sample_type))

### Quality ratio vs. cost ratio by sample type; the dotted vertical line
### marks quality parity with the LLM (ratio = 1).
df_costs_summ_long %>%
  ggplot(aes(x = m_quality, y = m_cost, color = sample_type, shape = sample_type)) +
  geom_point(size=3, alpha = .5) +  # Add points
  geom_line(alpha = .6) +  # Connect points with lines
  # geom_smooth(alpha = .2) +
  labs(x="Quality Ratio", y="Cost Ratio",
       color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_vline(xintercept = 1, linetype = "dotted") +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
  scale_color_manual(values = my_colors)

Glasgow Valence Norms

Load data

### setwd("/Users/seantrott/Dropbox/UCSD/Research/NLMs/llm_clt/src/analysis/")

### Read in all data
### Same structure as above, for the Glasgow valence norms.
df_all_results = read_csv("../../data/processed/gc_valence_results.csv")
## Rows: 14294 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): k, combo_index, list_num, n, pearson_centaur1, spearman_centaur1, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Sanity check: total number of simulated samples, and the count per list.
nrow(df_all_results)
## [1] 14294
### How many per list?
table(df_all_results$list_num)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##   62  126 2046   62  510 5150 3086  254  254  510  510 1022  126  254   62    6 
##   17 
##  254
### LLM data: list-wise correlations between GPT-4 judgments and the norms.
df_llm = read_csv("../../data/processed/gc_valence_llm_corrs.csv",
                  show_col_types = FALSE)

### Mean/SD of the LLM's list-wise Spearman correlation.
### FIX: the output column was misspelled `m_spearmam`.
df_llm %>%
  summarise(m_spearman = mean(spearman_llm),
            sd_spearman = sd(spearman_llm))

Figure 1g

### Individual participants correspond to simulated samples of size k = 1.
df_individuals = df_all_results %>%
  filter(k == 1)

### Histogram of single-participant correlations with the norms; the
### dotted blue line marks the mean LLM correlation across lists.
df_individuals %>%
  ggplot(aes(x = spearman_ppt)) +
  geom_histogram(alpha = .5) +
  labs(x="Correlation with Original Valence Norms", y="Count") +
  theme_minimal() +
  geom_vline(xintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue", alpha = .8) +
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Figure 1h

### Pivot
### Reshape to long format: one row per (sample, sample_type) correlation.
### "ppt" (participant) is relabeled "Human" for display.
df_results_long = df_all_results %>%
  select('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt',
         'k', 'combo_index', 'list_num') %>%
  pivot_longer(cols = c('spearman_centaur1', 'spearman_centaur2', 'spearman_ppt'),
               names_to = "sample_type",
               values_to = "correlation") %>%
  mutate(sample_type = sub("spearman_", "", sample_type)) %>%
  mutate(sample_type = ifelse(
    sample_type == "ppt", "Human", sample_type
  ))


### Visualize
### Mean correlation by sample size (k) and sample type, with +/- 2 SE
### error bars; the dotted line is the mean LLM correlation across lists.
df_results_summ = df_results_long %>%
  group_by(k, sample_type) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()),
            ### Drop grouping explicitly (silences the summarise() message).
            .groups = "drop") %>%
  mutate(sample_type = toTitleCase(sample_type))

df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color=factor(sample_type), shape = factor(sample_type)), size=3, alpha = .5) +  # Add points
  geom_line(aes(color=factor(sample_type))) +  # Connect points with lines
  ### FIX: `width` is a layer parameter, not an aesthetic — it was inside
  ### aes(), where ggplot2 ignores it (with a warning).
  geom_errorbar(aes(ymin=m_corr-se_corr * 2, ymax=m_corr+se_corr * 2,
                    color = factor(sample_type)), width = 0.2) +
  labs(x="Number of Participants", y="Correlation with Valence Norms", color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
scale_color_manual(values = my_colors)

Figure 1i

Here, we visualize projected differences in quality and cost of a human sample vs. GPT-4.

Setting up cost assumptions

For human cost, we assume:

  • Rate of $12 an hour.
  • Approximately 5 seconds per judgment.
  • Approximately 720 judgments per hour (3600 s / 5 s per judgment).

For GPT-4 cost, we assume:

  • $0.06 per 1000 generated tokens.
  • $0.03 per 1000 sampled tokens.
  • Approximately 20 sampled tokens per judgment.
  • Approximately 10 generated tokens per judgment.
### Human assumptions
RATE = 12                  # dollars per hour
SECONDS_PER_JUDGMENT = 5   # implies 3600/5 = 720 judgments per hour

### Human cost per judgment = hourly rate / judgments per hour.
HUMAN_CPJ = RATE / (3600/SECONDS_PER_JUDGMENT)

### GPT-4 assumptions, matching the stated pricing above:
### $0.03 per 1K sampled (prompt) tokens, $0.06 per 1K generated tokens.
### FIX: the previous constants (0.0003 / 0.0006) were 100x smaller than
### the stated prices, and the generated-token term carried a stray
### `1000 *` factor, so GPT_CPJ did not match the documented assumptions.
COST_PER_1K_SAMPLED = 0.03
COST_PER_1K_GENERATED = 0.06
NUM_SAMPLED_PER_JUDGMENT = 20
NUM_GENERATED_PER_JUDGMENT = 10

### GPT cost per judgment: tokens/1K times price per 1K, for each token type.
GPT_CPJ = (NUM_SAMPLED_PER_JUDGMENT / 1000) * COST_PER_1K_SAMPLED +
  (NUM_GENERATED_PER_JUDGMENT / 1000) * COST_PER_1K_GENERATED

Visualizing

### Visualize

### For each k, express quality and cost relative to the LLM-only baseline
### (a ratio of 1 means parity with GPT-4).
df_costs_summ = df_all_results %>%
  mutate(ratio_human_quality = spearman_ppt / spearman_llm) %>%
  mutate(ratio_human_cost = (k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur1_quality = spearman_centaur1 / spearman_llm) %>%
  mutate(ratio_centaur1_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur2_quality = spearman_centaur2 / spearman_llm) %>%
  mutate(ratio_centaur2_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  group_by(k) %>%
  summarise(m_human_ratio_quality = mean(ratio_human_quality),
            se_human_ratio_quality = sd(ratio_human_quality)/sqrt(n()),
            m_human_ratio_cost = mean(ratio_human_cost),
            ### FIX: the cost "SE" columns were plain SDs; divide by
            ### sqrt(n()) for consistency with the quality SEs. (Cost is
            ### constant within k, so the value is 0 either way.)
            se_human_ratio_cost = sd(ratio_human_cost)/sqrt(n()),
            ### Centaur1
            m_centaur1_ratio_quality = mean(ratio_centaur1_quality),
            se_centaur1_ratio_quality = sd(ratio_centaur1_quality)/sqrt(n()),
            m_centaur1_ratio_cost = mean(ratio_centaur1_cost),
            se_centaur1_ratio_cost = sd(ratio_centaur1_cost)/sqrt(n()),
            ### Centaur2
            m_centaur2_ratio_quality = mean(ratio_centaur2_quality),
            se_centaur2_ratio_quality = sd(ratio_centaur2_quality)/sqrt(n()),
            m_centaur2_ratio_cost = mean(ratio_centaur2_cost),
            se_centaur2_ratio_cost = sd(ratio_centaur2_cost)/sqrt(n()))

### Reshape the wide per-k summary into one row per (k, sample_type), with
### columns m_quality / se_quality / m_cost / se_cost, for plotting.
df_costs_summ_long <- df_costs_summ %>%
  pivot_longer(
    cols = -k,
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    sample_type = str_extract(variable, "human|centaur1|centaur2"),
    metric = case_when(
      str_detect(variable, "m_.*_quality") ~ "m_quality",
      str_detect(variable, "se_.*_quality") ~ "se_quality",
      str_detect(variable, "m_.*_cost") ~ "m_cost",
      str_detect(variable, "se_.*_cost") ~ "se_cost"
    )
  ) %>%
  select(-variable) %>%
  pivot_wider(
    names_from = metric,
    values_from = value
  ) %>%
  mutate(sample_type = toTitleCase(sample_type))


### Quality ratio vs. cost ratio by sample type; the dotted vertical line
### marks quality parity with the LLM (ratio = 1).
df_costs_summ_long %>%
  ggplot(aes(x = m_quality, y = m_cost, color = sample_type, shape = sample_type)) +
  geom_point(size=3, alpha = .5) +  # Add points
  geom_line(alpha = .6) +  # Connect points with lines
  labs(x="Quality Ratio", y="Cost Ratio",
       color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_vline(xintercept = 1, linetype = "dotted") +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
  scale_color_manual(values = my_colors)

Supplementary analysis 1

Here, we perform a supplementary analysis to investigate list-wise variation in the correlation.

Checking individual lists for supplementary analysis:

### List-wise summary: mean human-sample correlation per (k, list).
### NOTE: operates on the valence `df_results_long` built above.
df_results_summ = df_results_long %>%
  filter(sample_type == "Human") %>%
  group_by(k, sample_type, list_num) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()),
            ### Drop grouping explicitly (silences the summarise() message).
            .groups = "drop") %>%
  mutate(sample_type = toTitleCase(sample_type))
### One curve per list: correlation vs. number of participants; the dotted
### line marks the mean LLM correlation across lists.
df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color=factor(list_num)), alpha = .5, size = 2) +  # Add points
  geom_line(aes(color=factor(list_num))) +  # Connect points with lines
  labs(x="Number of Participants", y="Correlation with Valence Norms", color = "List", shape = "List") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM baseline
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom") 

We also ask whether list-wise variation in the valence norms correlates with list-wise variation in the LLM correlations.

### Collapse to one row per list (range and mean of the human-sample
### correlation across k), then attach the list-wise LLM correlations.
### FIX: the join key is now explicit rather than inferred (removes the
### "Joining, by = ..." message and guards against schema drift).
df_results_summ_list = df_results_summ %>%
  group_by(list_num) %>%
  summarise(max_corr_human = max(m_corr),
            min_corr_human = min(m_corr),
            mean_corr_human = mean(m_corr)) %>%
  inner_join(df_llm, by = "list_num")
### Scatter of list-wise LLM correlation vs. list-wise mean human-sample
### correlation, with a linear fit.
df_results_summ_list %>%
  ggplot(aes(x = spearman_llm,
             y = mean_corr_human)) +
  geom_point(size = 4, alpha = .4) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "List-wise LLM Correlation",
       y = "List-wise Human Sample Correlation")
## `geom_smooth()` using formula 'y ~ x'

### Test the association between list-wise LLM and human correlations.
### FIX: the original ran the identical cor.test() twice in a row; the
### duplicate call (and its verbatim-identical output) has been removed.
cor.test(df_results_summ_list$spearman_llm, df_results_summ_list$mean_corr_human)
## 
##  Pearson's product-moment correlation
## 
## data:  df_results_summ_list$spearman_llm and df_results_summ_list$mean_corr_human
## t = 2.5354, df = 15, p-value = 0.02285
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09102517 0.81405100
## sample estimates:
##       cor 
## 0.5477073

Finally, we ask whether list-wise variation in either measure can be predicted by list-wise variation in the words themselves.

# Folder containing one stimulus CSV per Glasgow list
folder_path <- "../../experiment/stimuli/glasgow_lists/"

# Create a list of CSV file paths
csv_files <- list.files(path = folder_path, pattern = "\\.csv$", full.names = TRUE)

# Read and combine the files.
# FIX: pass `show_col_types = FALSE` through to read_csv so the render is
# not cluttered with 17 identical column-spec messages.
combined_df <- map_df(csv_files, read_csv, show_col_types = FALSE)
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, display
## dbl (1): list_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Words per list: lists 1-16 have 51 words; list 17 has 55.
table(combined_df$list_number)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 
## 51 51 51 51 51 51 51 51 51 51 51 51 51 51 51 51 55
# Now merge with actual Glasgow norms
### Join the list assignments to the official norms and compute per-list
### summaries of valence (mean, SD-of-means, mean item SD) and word length.
df_values = read_csv("../../data/official/human/glasgow.csv") %>%
  select(word, Valence.M, Valence.SD, Valence.N, Length) %>%
  inner_join(combined_df) %>%
  mutate(list_num = list_number) %>%
  group_by(list_num) %>%
  summarise(mean_valence = mean(Valence.M),
            mean_valence_sd = mean(Valence.SD),
            sd_valence = sd(Valence.M),
            mean_length = mean(Length))
## Rows: 871 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): word
## dbl (28): Length, Arousal.M, Arousal.SD, Arousal.N, Valence.M, Valence.SD, V...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "word"
### Double-check observations per list
### The summary collapses to exactly one row per list (17 rows total).
nrow(df_values)
## [1] 17
table(df_values$list_num)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
### Now merge with list-wise variation
### NOTE(review): consider an explicit `by = "list_num"` here, as for the
### other joins, to avoid relying on key inference.
df_merged = df_values %>%
  left_join(df_results_summ_list)
## Joining, by = "list_num"
nrow(df_merged)
## [1] 17
### Does either the average or SD in valence predict variation?
### Regress list-wise human correlation on list-level valence statistics.
mod = lm(data = df_merged, mean_corr_human ~ mean_valence + sd_valence + mean_valence_sd)
summary(mod)
## 
## Call:
## lm(formula = mean_corr_human ~ mean_valence + sd_valence + mean_valence_sd, 
##     data = df_merged)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18211 -0.02600  0.01709  0.07061  0.10135 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)     -0.28377    1.15217  -0.246    0.809
## mean_valence     0.08419    0.15830   0.532    0.604
## sd_valence       0.25301    0.27836   0.909    0.380
## mean_valence_sd  0.19285    0.43516   0.443    0.665
## 
## Residual standard error: 0.09064 on 13 degrees of freedom
## Multiple R-squared:  0.07604,    Adjusted R-squared:  -0.1372 
## F-statistic: 0.3566 on 3 and 13 DF,  p-value: 0.7852
### No, but the average LLM correlation per list does.
### Simple regression: list-wise human correlation on list-wise LLM correlation.
mod = lm(data = df_merged, mean_corr_human ~ spearman_llm)
summary(mod)
## 
## Call:
## lm(formula = mean_corr_human ~ spearman_llm, data = df_merged)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.136535 -0.042413  0.009058  0.026781  0.102938 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    0.1880     0.2037   0.923   0.3708  
## spearman_llm   0.6865     0.2708   2.535   0.0229 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07345 on 15 degrees of freedom
## Multiple R-squared:    0.3,  Adjusted R-squared:  0.2533 
## F-statistic: 6.428 on 1 and 15 DF,  p-value: 0.02285

Iconicity Norms

Load data

### setwd("/Users/seantrott/Dropbox/UCSD/Research/NLMs/llm_clt/src/analysis/")

### Read in all data
### Bootstrap results for the iconicity norms: each row is one resampled
### combination of k participants from a list, with Pearson/Spearman
### correlations for the human-only sample and two "centaur" variants.
df_all_results = read_csv("../../data/processed/iconicity_results.csv")
## Rows: 61240 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): k, combo_index, list_num, n, pearson_centaur1, spearman_centaur1, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Sanity check on total row count.
nrow(df_all_results)
## [1] 61240
### How many per list?
table(df_all_results$list_num)
## 
##    1    2    3    4    5    6    7    8    9   10 
## 5753 6810 7340 5768 6278 5686 3735 6242 6289 7339
### LLM data
### GPT-4's correlation with the original iconicity norms, one row per list.
df_llm = read_csv("../../data/processed/iconicity_llm_corrs.csv")
## Rows: 10 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): list_num, n, pearson_llm, spearman_llm
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Average LLM performance across the 10 lists (mean rho ~= .60).
df_llm %>%
  summarise(m_spearman = mean(spearman_llm),
            sd_spearman = sd(spearman_llm))
## # A tibble: 1 × 2
##   m_spearman sd_spearman
##        <dbl>       <dbl>
## 1      0.595       0.103

Figure 1j

### Restrict to single-participant "samples" (k = 1) to characterize how well
### one individual rater correlates with the published norms.
df_individuals = df_all_results %>%
  filter(k == 1)

### Individual raters average rho ~= .37 (SD ~= .13), per the output below.
mean(df_individuals$spearman_ppt)
## [1] 0.3673823
sd(df_individuals$spearman_ppt)
## [1] 0.1319105
### Histogram of individual-rater correlations; the dotted blue line marks the
### mean LLM (GPT-4) correlation for comparison.
df_individuals %>%
  ggplot(aes(x = spearman_ppt)) +
  geom_histogram(alpha = .5) +
  labs(x="Correlation with Original Iconicity Norms", y="Count") +
  theme_minimal() +
  geom_vline(xintercept = mean(df_llm$spearman_llm), ### LLM 
              linetype = "dotted", color = "blue", alpha = .8) +
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Figure 1k

### Pivot
### Reshape to long format: one row per (k, combo_index, list_num, sample
### type), with `correlation` holding the Spearman value. The "spearman_"
### prefix is stripped at pivot time and "ppt" is relabeled "Human".
df_results_long = df_all_results %>%
  select(spearman_centaur1, spearman_centaur2, spearman_ppt,
         k, combo_index, list_num) %>%
  pivot_longer(cols = starts_with("spearman_"),
               names_to = "sample_type",
               names_prefix = "spearman_",
               values_to = "correlation") %>%
  mutate(sample_type = ifelse(sample_type == "ppt", "Human", sample_type))


### Visualize
df_results_summ = df_results_long %>%
  group_by(k, sample_type) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()))  %>%
  mutate(sample_type = toTitleCase(sample_type)) 
## `summarise()` has grouped output by 'k'. You can override using the `.groups`
## argument.
df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color=factor(sample_type), shape = factor(sample_type)), size=3, alpha = .5) +  # Add points
  geom_line(aes(color=factor(sample_type))) +  # Connect points with lines
  geom_errorbar(aes(ymin=m_corr-se_corr * 2, ymax=m_corr+se_corr * 2, width=0.2, color = factor(sample_type))) + 
  labs(x="Number of Participants", y="Correlation with Iconicity Norms", color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM 
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
scale_color_manual(values = my_colors)

Figure 1l

Here, we visualize projected differences in quality and cost of a human sample vs. GPT-4.

Setting up cost assumptions

For human cost, we assume:

  • Rate of $12 an hour.
  • Approximately 5 seconds per judgment.
  • Approximately 720 judgments per hour (3600 s / 5 s per judgment).

For GPT-4 cost, we assume:

  • $0.06 per 1000 generated tokens.
  • $0.03 per 1000 sampled tokens.
  • Approximately 20 sampled tokens per judgment.
  • Approximately 10 generated tokens per judgment.
### Human assumptions
### $12/hour at 5 seconds per judgment -> 720 judgments/hour.
RATE = 12
SECONDS_PER_JUDGMENT = 5

### Human cost per judgment: 12 / 720 ~= $0.0167.
HUMAN_CPJ = RATE / (3600/SECONDS_PER_JUDGMENT)

### GPT-4 assumptions (dollars per 1,000 tokens, matching the stated pricing
### above: $0.03 per 1K sampled, $0.06 per 1K generated).
### FIX(review): the original constants (0.0003 / 0.0006) did not match the
### stated per-1K prices, and the generated-token term carried a stray
### factor of 1000, yielding ~$0.006 per judgment instead of $0.0012.
COST_PER_1K_SAMPLED = 0.03
COST_PER_1K_GENERATED = 0.06
NUM_SAMPLED_PER_JUDGMENT = 20
NUM_GENERATED_PER_JUDGMENT = 10

### GPT-4 cost per judgment:
### (20/1000) * 0.03 + (10/1000) * 0.06 = 0.0006 + 0.0006 = $0.0012.
GPT_CPJ = (NUM_SAMPLED_PER_JUDGMENT / 1000) * COST_PER_1K_SAMPLED +
  (NUM_GENERATED_PER_JUDGMENT / 1000) * COST_PER_1K_GENERATED

Visualizing

### Visualize

### Quality and cost ratios relative to GPT-4 alone, averaged within each k.
### Quality ratio > 1 means the sample out-correlates the LLM; cost ratio is
### the sample's total judgment cost divided by the LLM's cost. Centaur
### samples pay for the LLM judgment plus k human judgments.
df_costs_summ = df_all_results %>%
  mutate(ratio_human_quality = spearman_ppt / spearman_llm) %>%
  mutate(ratio_human_cost = (k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur1_quality = spearman_centaur1 / spearman_llm) %>%
  mutate(ratio_centaur1_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  mutate(ratio_centaur2_quality = spearman_centaur2 / spearman_llm) %>%
  mutate(ratio_centaur2_cost = (GPT_CPJ + k * HUMAN_CPJ) / GPT_CPJ) %>%
  group_by(k) %>%
  summarise(m_human_ratio_quality = mean(ratio_human_quality),
            se_human_ratio_quality = sd(ratio_human_quality)/sqrt(n()),
            m_human_ratio_cost = mean(ratio_human_cost),
            ### FIX(review): the cost "SE"s below were raw sd()s; they now
            ### divide by sqrt(n()) like the quality SEs. Numerically a no-op:
            ### cost ratios depend only on k, so their sd within a k group is 0.
            se_human_ratio_cost = sd(ratio_human_cost)/sqrt(n()),
            ### Centaur1
            m_centaur1_ratio_quality = mean(ratio_centaur1_quality),
            se_centaur1_ratio_quality = sd(ratio_centaur1_quality)/sqrt(n()),
            m_centaur1_ratio_cost = mean(ratio_centaur1_cost),
            se_centaur1_ratio_cost = sd(ratio_centaur1_cost)/sqrt(n()),
            ### Centaur2
            m_centaur2_ratio_quality = mean(ratio_centaur2_quality),
            se_centaur2_ratio_quality = sd(ratio_centaur2_quality)/sqrt(n()),
            m_centaur2_ratio_cost = mean(ratio_centaur2_cost),
            se_centaur2_ratio_cost = sd(ratio_centaur2_cost)/sqrt(n()))

### Reshape the per-k summary to long format: one row per (k, sample type)
### with m_quality / se_quality / m_cost / se_cost columns.
df_costs_summ_long <- df_costs_summ %>%
  pivot_longer(
    cols = -k,
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    ### e.g. "se_centaur1_ratio_cost" -> sample_type "centaur1", metric "se_cost"
    sample_type = str_extract(variable, "human|centaur1|centaur2"),
    metric = paste0(
      str_extract(variable, "^(m|se)"),
      "_",
      str_extract(variable, "(quality|cost)$")
    )
  ) %>%
  select(-variable) %>%
  pivot_wider(
    names_from = metric,
    values_from = value
  ) %>%
  mutate(sample_type = toTitleCase(sample_type))


### Plot cost ratio against quality ratio for each sample type across k.
### Points right of the dotted line (quality ratio > 1) out-perform GPT-4
### alone; the y-axis shows how much more they cost. `my_colors` is assumed
### to be defined earlier in the document.
df_costs_summ_long %>%
  ggplot(aes(x = m_quality, y = m_cost, color = sample_type, shape = sample_type)) +
  geom_point(size=3, alpha = .5) +  # Add points
  geom_line(alpha = .6) +  # Connect points with lines
  labs(x="Quality Ratio", y="Cost Ratio",
       color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_vline(xintercept = 1, linetype = "dotted") +
  theme(text = element_text(size = 15),
        legend.position="bottom") +
  scale_color_manual(values = my_colors)

Supplementary analysis 1

Checking individual lists for supplementary analysis:

### Per-list human learning curves (supplementary): same summary as Figure 1k
### but computed separately for each list, to assess list-wise variation.
### Note: this overwrites the earlier df_results_summ.
df_results_summ = df_results_long %>%
  filter(sample_type == "Human") %>%
  group_by(k, sample_type, list_num) %>%
  summarize(m_corr = mean(correlation),
            sd_corr = sd(correlation),
            se_corr = sd(correlation)/sqrt(n()))  %>%
  mutate(sample_type = toTitleCase(sample_type)) 
## `summarise()` has grouped output by 'k', 'sample_type'. You can override using
## the `.groups` argument.
### One curve per list; the dotted blue line again marks mean LLM performance.
df_results_summ %>%
  ggplot(aes(x = k, y = m_corr)) +
  geom_point(aes(color=factor(list_num)), alpha = .5, size = 2) +  # Add points
  geom_line(aes(color=factor(list_num))) +  # Connect points with lines
  labs(x="Number of Participants", y="Correlation with Iconicity Norms", color = "List", shape = "List") +
  theme_minimal() +
  geom_hline(yintercept = mean(df_llm$spearman_llm), ### LLM 
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom") 

Supplementary analysis 2

We also recalculate NNB within each list to ensure that it doesn’t depend hugely on this list-wise variation.

### Attach each list's LLM correlation (joined on list_num, per the message
### below) and compute the human-minus-GPT-4 difference at each k.
df_results_summ = df_results_summ %>%
  left_join(df_llm) %>%
  mutate(llm_diff = m_corr - spearman_llm)
## Joining, by = "list_num"
### Curves above the dotted zero line indicate the human sample outperforms
### GPT-4 on that list at that sample size.
df_results_summ %>%
  ggplot(aes(x = k, y = llm_diff)) +
  geom_point(aes(color=factor(list_num)), size=2, alpha = .5) +  # Add points
  geom_line(aes(color=factor(list_num))) +  # Connect points with lines
  labs(x="Number of Participants", y="Difference (Human - GPT-4)", color = "Sample Type", shape = "Sample Type") +
  theme_minimal() +
  geom_hline(yintercept = 0, ### LLM 
              linetype = "dotted", color = "blue",
            alpha = .8) +
  theme(text = element_text(size = 15),
        legend.position="bottom")